In [1]:
import os
# for handling dataframe
import re
import csv
import pandas as pd
import numpy as np
# for BOW
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# for wordcloud
import matplotlib.pylab as plt
from wordcloud import WordCloud
from PIL import Image
# working directory setting
work_dir = 'D:/Document/project/HYStudy/scripts'
os.chdir(work_dir)
In [2]:
raw_text = pd.read_csv('[HYStudy 17th] ex_data.csv', encoding='utf-8', names=['content'])
raw_text.head()
Out[2]:
content
0
대리점 폰 사려 꼭알아가야할점 그런것들있나 제 지금 gpro2 쓰 넘 약정 끝나 그...
1
위약4 질문 g5 제 ㅎㅇ 개통 599유 조건 그 유지 끝내 나 폰 바꾸 되 저 청...
2
가격대 중고가20만원 포함 추천 이어폰 어느정도 후보군 추려 er4 ue900 트파...
3
잠깐 기기 위약금 나오 제 g5 새기 생기 통신사 skt 유심 하려 보 g5새 lg...
4
g5 진열 되 상태 폰 못쓸거같은데 예 도색 벗기 요
In [3]:
corpus = np.array(raw_text['content'])
print(len(corpus))
print(corpus[0:3])
raw_text.tail()
8349
[ '대리점 폰 사려 꼭알아가야할점 그런것들있나 제 지금 gpro2 쓰 넘 약정 끝나 그 폰 발열 하구 베터리 따르 엇보 와이파이 접촉 불량때문 와이파이 켜 않 이참 폰 바꾸 하 노트5 g5 생각 g5 평이 너무 안좋더 노트5생 요즘 노트5 대리점 구입 얼마인가 kt쓰고있 기기 하 생각 폰 법 바뀌 나 구매 알아야할점 그런것들있나 네이버 치 노트5 정도 하 것 맞 요즘 69했을경우 음 제 대충 보기 요금 따르 공시 바뀌 요금 비싼요금제 시작 한달 쓰 바 바꾸 되 이런것들 추가적 있 대리점 요금 구매 혜택 있 블루스 그런곳 바꾸 정보 주세 사 당하 하 흑'
'위약4 질문 g5 제 ㅎㅇ 개통 599유 조건 그 유지 끝내 나 폰 바꾸 되 저 청구 위약4 3 ㅎㅇ완납 4 금액 550 사용 제외 730 이 되 ㅎㅇ 개통 g5 사용 다시 번이했 경우 위 금액만큼 제 물 맞 문의 드립 다'
'가격대 중고가20만원 포함 추천 이어폰 어느정도 후보군 추려 er4 ue900 트파 포낙 보스 h3 등 있 무선 제이버드 akg 브라 rox sbh80 정도 있나봐 추천 대브븐 유선 해주 무선 음악 리면 되는거 apt x 지원 sbh80 유닛 소리 어느정도 간음 안되 비교 분 계신 그리 g5 이번 나 리시버 o 연결 위 후보군 성능 다 올라가능건가 니 b o 성향 맞추 유닛 가리 쓰 분 계신 요']
Out[3]:
content
8344
카우붐 마지막 수령 레노 g50 amd 램6기 모델 비닐 다 안떼졌 베젤 극 기스 ...
8345
라온티앤아 타무즈 스톤 x 사용 싼 게이밍 마우스 다를봐 없 게이밍 라온티앤아 타무...
8346
만약 ㅎㅇ 핸드폰 구입 핸드폰 새 사 되 크 ㅂㅇ ㅎㅇ v10 을 구입 초 g5 나...
8347
cube t8 plus noroot 내장메모리 통합 순정 리커버리 sd 해제 cub...
8348
노트북 살 하 사야 하 모르 업무용 가지 녀 집 사용 게임 던파 가끔 하 현재 사용...
In [4]:
# except 1-letter word
## min_df: integer(frequency), float(ratio)
tf_vectorizer = CountVectorizer(min_df = 0.001, token_pattern=r'\w{2,}')
tf_corpus = tf_vectorizer.fit(corpus)
tf_bow = tf_vectorizer.fit_transform(corpus)
tf_bow
Out[4]:
<8349x3578 sparse matrix of type '<class 'numpy.int64'>'
with 193856 stored elements in Compressed Sparse Row format>
In [5]:
tfidf_vectorizer = TfidfVectorizer(min_df = 0.001, token_pattern=r'\w{2,}')
tfidf_corpus = tf_vectorizer.fit(corpus)
tfidf_bow = tfidf_vectorizer.fit_transform(corpus)
tfidf_bow
Out[5]:
<8349x3578 sparse matrix of type '<class 'numpy.float64'>'
with 193856 stored elements in Compressed Sparse Row format>
In [6]:
# check vocabulary in TDM
print(len(tf_corpus.vocabulary_))
print(len(tf_corpus.get_feature_names()))
tf_vectorizer.get_feature_names()[1000:1010]
3578
3578
Out[6]:
['대폭', '대하', '대학생', '대한민국', '대해', '대형', '대화면', '댓글', '더럽', '더불']
In [7]:
# frequency count
tf_word_sum = tf_bow.toarray().sum(axis=0)
tf_word_name = tf_corpus.get_feature_names()
tf_word_dict = {}
for i in range(len(tf_word_sum)):
tf_word_dict[tf_word_name[i]] = tf_word_sum[i]
print(tf_corpus.get_feature_names()[150:160])
print(tf_word_sum[150:160])
['g5쪽', 'g5출시', 'g5카메', 'g5하', 'g5후', 'g6', 'g7', 'g7x', 'galaxy', 'gk']
[ 9 16 11 9 9 66 11 15 54 18]
In [8]:
# word & index number
tf_word_dict
Out[8]:
{'기록': 27,
'계약서': 18,
'마시': 20,
'그래픽카드': 45,
'고민': 874,
'괜찮다': 29,
'나을': 82,
'옮기': 106,
'안타깝': 39,
'선택지': 22,
'광주': 17,
'철회': 21,
'문의': 180,
'노트북': 146,
'화이트': 82,
'유플러스': 123,
'일하': 19,
'기본': 429,
'구매자': 55,
'하이마트': 171,
'홍보': 38,
'ㅌㅋㄴ': 25,
'64gb': 27,
'ls2d': 22,
'경기': 13,
'위치': 108,
'고정이': 12,
'먼지': 59,
'메뉴': 42,
'국내': 199,
'편한': 37,
'평소': 77,
'싼맛': 10,
'손가락': 50,
'날씨': 52,
'입체감': 10,
'화웨이': 37,
'붙이': 98,
'효도': 10,
'시계': 39,
'방진': 14,
'분리형': 16,
'만나': 42,
'a7': 73,
'좋을거': 18,
'클리앙': 11,
'인터파크': 18,
'찍기': 19,
'답답': 84,
'매달': 38,
'무겁': 41,
'퀵커버': 20,
'전국': 10,
'의향': 9,
'mode': 152,
'정리': 84,
'국민': 18,
'일주일': 95,
'특별': 44,
'묻히': 10,
'웹서핑': 73,
'튼튼': 27,
'점이': 69,
'dslr': 41,
'가죽': 30,
'대부분': 138,
'높다': 14,
'작업': 44,
'예약': 143,
'진심': 16,
'사과': 27,
'칩셋': 13,
'cat6': 21,
'택배': 151,
'어둡': 119,
'노트5': 245,
'이번': 863,
'웨이즈': 69,
'저번': 34,
'특성': 15,
'지난번': 12,
'파손': 32,
'모듈빼': 9,
'채우': 32,
'kt': 460,
'단말': 31,
'사업부': 14,
'안되는거': 24,
'게다': 53,
'구해': 31,
'조립': 27,
'장기적': 10,
'거리': 65,
'탐나': 27,
'ㅂㅇ': 79,
'내야': 27,
'달라': 109,
'가입': 150,
'판매': 195,
'해상도': 110,
'되지': 105,
'통하': 114,
'가기': 45,
'이럴': 11,
'판정': 21,
'반응속도': 14,
'저가형': 19,
'정상적': 38,
'유투브': 42,
'적응': 93,
'절약': 13,
'ㄷㄷ': 123,
'학생': 14,
'인봉': 32,
'광탈': 46,
'활성화': 22,
'은거': 11,
'스마트워치': 16,
'이부': 31,
'안주': 13,
'스냅': 87,
'길이': 42,
'조사': 10,
'마구': 10,
'인해': 66,
'techholic': 9,
'다시': 637,
'싸구려': 19,
'ㅅㄷㄹ가': 11,
'감기': 12,
'대기': 58,
'하도': 37,
'하필': 12,
'가격대': 80,
'단통': 11,
'빠르': 235,
'없애': 48,
'신경': 133,
'귀찮': 118,
'절대': 69,
'자극': 11,
'이르': 56,
'편차': 12,
'사실상': 44,
'작동': 87,
'바뀌': 116,
'교품증': 185,
'한마디': 18,
'후속': 31,
'취소': 59,
'아그': 9,
'제이슨': 10,
'다운': 81,
'생김': 10,
'수채화': 29,
'중간중간': 11,
'인상': 10,
'받으': 10,
'갤수육': 15,
'내년': 22,
'운영': 19,
'audio': 14,
'정착': 13,
'모듈': 2625,
'처리': 68,
'개취': 35,
'확대': 41,
'점유율': 27,
'어머니': 125,
'현존': 16,
'물리': 51,
'서류': 23,
'방출': 13,
'못생기': 19,
'세로': 24,
'현명': 13,
'겨울': 11,
'각종': 25,
'땡기': 42,
'공간': 28,
'총평': 14,
'인간': 10,
'해주시': 27,
'생길': 40,
'사무실': 26,
'규모': 13,
'오더': 9,
'설명': 112,
'드릴': 29,
'sid2': 23,
'알아보다': 21,
'마감': 153,
'발급': 34,
'보다': 61,
'사은품': 171,
'장소': 13,
'다음날': 24,
'빠르다': 11,
'심지': 44,
'실수': 38,
'그럼': 23,
'발표': 86,
'번이': 322,
'옵션': 42,
'울산': 11,
'남지': 10,
'하단부': 48,
'광각': 530,
'하나하나': 10,
'아저씨': 13,
'하이': 19,
'각도': 15,
'보스': 11,
'매년': 10,
'갤6': 107,
'스타': 20,
'어떠': 57,
'안정감': 12,
'망작': 9,
'빠릿': 22,
'펌웨어': 27,
'일반인': 34,
'꺼지': 59,
'불가능': 62,
'쓰지': 45,
'아침': 66,
'주지': 16,
'탑재': 115,
'물품': 12,
'오늘': 994,
'전화로': 11,
'부2': 41,
'하지않': 9,
'먼저': 112,
'프렌즈': 115,
'내면': 27,
'근래': 13,
'주면': 18,
'넘사벽': 17,
'ㄹㄱㅂㅇ': 97,
'인정': 37,
'과장': 13,
'마마무': 21,
'산지': 35,
'동생': 57,
'감감무소식': 9,
'양품': 213,
'변경': 164,
'하시': 212,
'하이브리드': 13,
'a4용지': 15,
'국민카드': 31,
'edge': 40,
'케이스': 677,
'백그라운드': 13,
'이것이': 11,
'사놓': 12,
'등에': 22,
'현실': 39,
'프로세서': 20,
'g540': 17,
'각각': 34,
'qc': 69,
'중요시': 12,
'galaxy': 54,
'그정': 33,
'6in': 12,
'htc': 32,
'고정': 53,
'유리': 57,
'채택': 47,
'목적': 36,
'롤링': 28,
'삼성': 941,
'ㅅㅋ': 69,
'스크린': 42,
'욕심': 29,
'그림': 20,
'덕분': 49,
'중요': 126,
'초중반': 10,
'밤에': 35,
'지속적': 12,
'아내': 16,
'9ghz': 9,
'신경안쓰': 12,
'메모리': 94,
'모델명': 12,
'노이즈': 55,
'보다보': 34,
'인수': 11,
'고르': 62,
'나오길': 18,
'남자': 45,
'물에': 13,
'아식스': 19,
'글쓰': 22,
'고집': 17,
'반해': 16,
'이젠': 65,
'방식': 146,
'사정': 10,
'은근': 42,
'단통법': 126,
'보여주': 73,
'외관': 116,
'오고': 24,
'저장': 53,
'위하': 185,
'연락': 100,
'지우': 22,
'때까지': 9,
'아가': 17,
'대화면': 10,
'신경쓰': 79,
'커지': 24,
'서비스': 126,
'용어': 20,
'얼른': 33,
'감사': 139,
'차후': 20,
'기업': 60,
'2년정': 10,
'오해': 15,
'상상': 19,
'부무': 36,
'완납': 56,
'서랍': 13,
'업무': 15,
'해외': 105,
'올라가': 28,
'현완': 69,
'귀가': 19,
'치명적': 36,
'이거': 426,
'128gb': 10,
'하반기': 15,
'상태': 268,
'안정성': 9,
'커뮤니티': 21,
'엄청': 370,
'뒤지': 17,
'현실적': 11,
'정품': 62,
'갤s7': 91,
'url': 12,
'선보': 21,
'sd': 96,
'보류': 10,
'일일이': 11,
'114': 13,
'인터페이스': 15,
'마케팅': 89,
'이어폰': 576,
'분위기': 60,
'정확': 110,
'맨날': 22,
'저항': 16,
'유지': 182,
'원활': 11,
'팍팍': 11,
'다가오': 20,
'서비스센터': 181,
'be': 36,
'떨어지': 123,
'착한': 10,
'하루': 144,
'해결방법': 10,
'트렌드': 14,
'기왕': 10,
'극복': 12,
'네트워크': 10,
'vga': 12,
'리모콘': 14,
'현대': 18,
'단점': 233,
'폰값': 11,
'오류': 53,
'난리': 28,
'등록': 33,
'계기': 12,
'개인적': 365,
'naver': 140,
'for': 82,
'5se': 12,
'laptop': 20,
'중앙': 15,
'자체': 240,
'연결': 335,
'여론': 9,
'스토어': 11,
'중국': 78,
'앰프': 14,
'유선': 34,
'눈팅': 77,
'electronics': 56,
'완전체': 10,
'어짜피': 30,
'구경': 55,
'방탄': 12,
'베샵': 31,
'인거': 21,
'귀엽': 11,
'카드결제': 20,
'하나': 358,
'적지': 10,
'기사': 56,
'포함': 107,
'건너': 10,
'정보공유': 10,
'구글': 89,
'어려': 25,
'해당': 85,
'asrock': 10,
'잠금': 42,
'나타내': 26,
'데이트': 128,
'열흘': 15,
'지프': 40,
'대세': 21,
'나르': 211,
'허허': 22,
'노트3': 91,
'빠릿하': 12,
'겔럭시': 16,
'플레이': 65,
'sw': 9,
'발생': 175,
'지네': 11,
'미디어': 18,
'지점': 17,
'마트': 10,
'꺼리': 10,
'상단': 103,
'돌아가': 50,
'좋다': 128,
'게임': 469,
'생기': 226,
'분리': 152,
'카드': 216,
'빠지': 117,
'쌓이': 9,
'예상': 145,
'성향': 18,
'땡겨': 9,
'개철': 58,
'제시': 13,
'샀다': 21,
'능력': 14,
'aspx': 14,
'한데': 143,
'희망': 20,
'그때': 38,
'강력': 17,
'듣기': 21,
'반대쪽': 15,
'강제': 25,
'아마': 30,
'허접': 18,
'lcd': 79,
'있을까': 175,
'일정': 44,
'유일': 30,
'간지': 11,
'gpro2': 16,
'롤리팝': 26,
'주변기기': 29,
'볼땐': 10,
'수요': 13,
'로지텍': 26,
'구합': 10,
'가족': 79,
'와중': 17,
'어떨까': 20,
'원가': 14,
'흥미': 20,
'디자인과': 25,
'순위': 16,
'결론적': 23,
'이상은': 24,
'주변': 85,
'of': 89,
'검색': 193,
'599요금제': 34,
'한시': 19,
'간단': 148,
'쥐5': 26,
'확정': 21,
'어이': 10,
'더하': 44,
'직영점': 49,
'부정적': 13,
'공짜': 60,
'걸치': 9,
'고치': 17,
'기회': 38,
'측정': 61,
'버벅': 60,
'장착': 167,
'samsung': 17,
'스냅드래곤': 37,
'아니': 1532,
'버젼': 12,
'성공': 100,
'설레': 15,
'한쪽': 62,
'남기': 74,
'잡음': 25,
'한번': 124,
'이제': 468,
'정책': 146,
'lte': 122,
'유독': 19,
'저하': 16,
'버튼': 233,
'단자': 72,
'자국': 19,
'현상': 172,
'lte2': 13,
'재미': 59,
'패턴': 37,
'팬택': 31,
'걸리': 134,
'어서': 21,
'젠더': 129,
'내일': 257,
'g4': 775,
'내부': 70,
'방금': 79,
'있는곳': 10,
'차액': 17,
'좋긴한데': 11,
'우리나라': 19,
'이전': 198,
'케이블': 117,
'만료': 16,
'루머': 45,
'이정': 303,
'그나': 192,
'저작권자': 10,
'안봐': 11,
'42mm': 18,
'보조': 27,
'후기': 376,
'롯데': 17,
'ppl': 21,
'직장인': 10,
'의도': 9,
'보자': 32,
'시키': 48,
'사용성': 9,
'비디오': 22,
'아쉽': 335,
'구간': 9,
'적절': 14,
'도료': 16,
'사용하다': 46,
'못하': 468,
'여러모': 27,
'줄이': 35,
'한번더': 13,
'상승': 34,
'이후': 230,
'글쎄': 13,
'무이자': 27,
'마이크로': 34,
'싼거': 19,
'크롬': 133,
'행보': 9,
'깔리': 9,
'한손': 68,
'버벅거': 20,
'후로': 11,
'금방': 41,
'결과물': 29,
'encode': 10,
'버스': 35,
'진입': 31,
'f700s': 25,
'칭찬': 38,
'신기': 120,
'타입': 46,
'사운드': 111,
'첨부': 50,
'떠오르': 17,
'부럽': 22,
'난감': 17,
'수치': 18,
'대응': 18,
'처분': 23,
'os': 43,
'발전': 57,
'hdmi': 24,
'반영': 18,
'유사': 11,
'못봐': 9,
'고장나': 17,
'카드사': 19,
'카페': 45,
'하라': 24,
'5x': 22,
'오랜만': 88,
'이런거': 57,
'갈수': 15,
'레노버': 55,
'밧데리': 55,
'특유': 19,
'난다': 14,
'운전': 16,
'일반': 271,
'갤칠': 34,
'g7x': 15,
'말로': 24,
'사라지': 34,
'태블릿': 21,
'여행': 55,
'뜨겁': 39,
'편리': 57,
'죽이': 12,
'아이폰7': 36,
'있다': 357,
'이틀': 49,
'대상': 46,
'누나': 18,
'별차이': 22,
'compulsory': 81,
'더이': 27,
'버리': 109,
'비슷': 295,
'끄적': 10,
'위주': 51,
'알람': 21,
'소음': 16,
'내장': 147,
'여친': 19,
'넣어주': 12,
'선택': 585,
'사고': 144,
'참여': 37,
'지포': 24,
'소개': 70,
'상담': 42,
'필요': 317,
'다음주': 47,
'엄마': 25,
'아이폰se': 49,
'침수': 25,
'불구': 43,
'떨구': 29,
'음향': 34,
'사용기간': 9,
'ㅋㅌㅂㅇ': 142,
'애초': 38,
'5v': 13,
'편하': 210,
'오른쪽': 110,
'화소': 64,
'이득': 36,
'짜증나': 31,
'newsid': 12,
'보급형': 65,
'라오': 10,
'착각': 11,
'최적화': 90,
'공기계': 98,
'대구': 26,
'브랜드': 93,
'내용': 129,
'디지털': 21,
'들르': 21,
'캡쳐': 35,
'홈피': 9,
'같습': 63,
'티탄색상': 18,
'자세': 115,
'요거': 9,
'르그번': 15,
'뷰2': 19,
'출력': 70,
'53mm': 24,
'기쁘': 10,
'이면': 22,
'예의': 25,
'다녀오': 10,
'고질적': 10,
'네비': 14,
'마시멜로': 77,
'메인보드': 114,
'꽂히': 26,
'차량용': 13,
'촬영': 213,
'뽑기': 85,
'물고': 10,
'패드': 28,
'감싸': 11,
'실행': 78,
'그부분': 14,
'센스': 11,
'체험존': 40,
'잘몰': 11,
'욕먹': 18,
'기억': 83,
'호환': 112,
'놔두': 11,
'전원': 229,
'삼성꺼': 24,
'설정': 203,
'벌어지': 18,
'아이템': 14,
'스크래치': 22,
'h61m': 15,
'알루미늄': 34,
'올립': 69,
'연락처': 9,
'글씨': 11,
'빠릿빠릿': 21,
'마감도': 9,
'비해': 184,
'인하': 23,
'사양': 71,
'장사': 18,
'검정': 9,
'벚꽃': 20,
'구리': 17,
'탈부착': 29,
'어제': 420,
'쓰긴': 11,
'신분증': 24,
'파일': 84,
'베이스': 21,
'마치': 65,
'사신': 33,
'aod': 24,
'g5광각': 10,
'신작': 9,
'후면': 283,
'대해': 106,
'영상': 186,
'개발': 63,
'노트4s': 26,
'교체식': 10,
'생각': 1984,
'반응': 97,
'개통': 684,
'초성': 22,
'비하': 31,
'고장': 90,
'누르': 237,
'지4': 15,
'임대': 17,
'접어': 11,
'대만족': 35,
'미리': 70,
'단차': 447,
'코어': 12,
'보단': 21,
'전산': 14,
'단말기': 89,
'g5사': 51,
'밸런스': 14,
'추가지원': 18,
'파악': 14,
'지역': 36,
'안나': 133,
'베스트샵': 77,
'확장성': 13,
'놀라': 33,
'차량': 23,
'기타': 61,
'pro2': 19,
'주로': 87,
'와이파이': 134,
'이동': 252,
'신호': 13,
'개인': 76,
'더럽': 12,
'벗겨지': 12,
'강하': 34,
'주실': 9,
'the': 414,
'전화기': 21,
'4s': 13,
'아이디': 18,
'wb': 94,
'오늘자': 13,
'mwc': 22,
'다이얼': 15,
'youtube': 76,
'의심': 36,
'방통': 17,
'추억': 10,
'중복': 21,
'최저': 25,
'제발': 74,
'도착': 97,
'단지': 27,
'플립': 11,
'였습니': 20,
'달고': 42,
'세트': 18,
'착탈식': 40,
'휴대용': 12,
'두가': 41,
'나타': 19,
'벤치마크': 19,
'번갈': 16,
'아노다이징': 17,
'이기': 19,
'편안': 24,
'디바이스': 43,
'끼울': 14,
'계열': 10,
'완벽': 71,
'획기적': 11,
's7엣지': 115,
'후에': 67,
'왜곡': 138,
'중고': 261,
'묶이': 14,
'본문': 9,
'감탄': 15,
'띄우': 29,
'시각': 13,
'책상': 9,
'만듦새': 12,
'좌표': 184,
'떨어트': 25,
'신선': 25,
'한창': 10,
'밀어': 12,
'모아': 19,
'신품': 17,
'하진': 9,
'64g': 35,
'지문': 253,
'날짜': 31,
'미루': 15,
'측면': 72,
'함정': 31,
'형광등': 10,
'단독': 11,
'해석': 15,
'차별화': 13,
'입히': 22,
'청구': 100,
'그날': 11,
'타임': 34,
'쓸모': 16,
'보도': 11,
'기술적': 17,
'가루': 9,
'가정': 30,
'힘들': 229,
'필요도': 10,
'군요': 12,
'인상적': 16,
'뉴스': 32,
'정도': 930,
'집사람': 15,
'베터리': 130,
'당연': 136,
'표현': 54,
'절연': 21,
'on': 96,
'넥서스': 51,
'각설': 12,
'통해': 46,
'마무리': 18,
'못쓰': 62,
'자신': 24,
'흔적': 11,
'물론': 32,
'계속': 59,
'바래': 22,
'일도': 11,
'g5쓰': 9,
'항목': 10,
'그게': 98,
'편입': 14,
'음량': 22,
'80l000alus': 14,
'돌고': 11,
'여태': 43,
'터치': 172,
'후반': 20,
'종종': 19,
'g580': 35,
'단어': 20,
'훌륭': 41,
'갤럭키': 142,
'당시': 57,
'신용카드': 15,
'여름': 31,
'인생': 10,
'주변부': 15,
'16gb': 11,
'good': 30,
'대략': 87,
'한명': 15,
'다양': 112,
'구라베젤': 31,
'효율': 52,
'5s': 42,
'저조': 81,
'ㄷㄷㄷㄷㄷ': 14,
'비닐': 16,
'59요금제': 80,
'멈추': 42,
'검수': 13,
'들림': 22,
'늘리': 14,
'유격때문': 10,
'메세지': 17,
'몰레': 192,
'판매자': 62,
'기본기': 17,
'뒷부분': 11,
'두시': 11,
'기울': 23,
'저번주': 27,
'심해': 48,
'내방': 63,
'밴드': 246,
'타격': 18,
'제일': 206,
'이벤트': 299,
'히트': 22,
'아마존': 21,
'어떤': 115,
'여행가': 16,
'쾌적': 31,
'기계': 140,
'지금까지': 105,
'고속': 90,
'자꾸': 140,
'언제쯤': 28,
'그동안': 68,
'전후': 15,
'상품': 67,
'여유': 22,
'착탈': 23,
'판단': 54,
'일어나': 16,
'ㅅㅋㄱㅂ': 138,
'시세가': 13,
'반대편': 16,
'올려놓': 13,
'사람인': 9,
'하니': 183,
'무시': 27,
'월드': 13,
'대충': 111,
'재생': 101,
'시리즈': 117,
'미지원': 17,
'업자': 31,
'못가': 15,
'by': 37,
'대용량': 18,
'안계': 9,
'전에': 45,
'망해': 10,
'둘째': 14,
'화요일': 16,
'말도': 23,
'계신': 147,
'버전': 79,
'깜빡': 16,
's6엣지': 14,
'정지': 14,
'요금': 733,
'사용환경': 11,
'공감': 12,
'아쉬운점': 19,
'바깥': 9,
'이동하': 18,
'무언': 19,
'안녕': 236,
'분할': 12,
'좋긴': 23,
'신박': 14,
'이만': 11,
'알지': 18,
'짜증': 39,
'한해': 12,
'동그라미': 12,
'lg꺼': 10,
'일요일': 19,
'분이': 77,
'용도': 51,
'순정': 29,
'사용': 2178,
'분야': 10,
'자전거': 20,
'나오는거': 20,
'유격': 1077,
'오프라인': 38,
'중심': 22,
'i3': 41,
'구매가': 23,
...}
In [9]:
# check vocabulary in TDM
print(len(tfidf_corpus.vocabulary_))
print(len(tfidf_corpus.get_feature_names()))
tfidf_vectorizer.get_feature_names()[1000:1010]
3578
3578
Out[9]:
['대폭', '대하', '대학생', '대한민국', '대해', '대형', '대화면', '댓글', '더럽', '더불']
In [10]:
# frequency count
tfidf_word_sum = tfidf_bow.toarray().sum(axis=0)
tfidf_word_name = tfidf_corpus.get_feature_names()
tfidf_word_dict = {}
for i in range(len(tfidf_word_sum)):
tfidf_word_dict[tfidf_word_name[i]] = tfidf_word_sum[i]
print(tfidf_corpus.get_feature_names()[150:160])
print(tfidf_word_sum[150:160])
['g5쪽', 'g5출시', 'g5카메', 'g5하', 'g5후', 'g6', 'g7', 'g7x', 'galaxy', 'gk']
[ 1.9353321 4.90042073 3.53566612 2.89852336 2.48341187
11.61954354 2.57126653 2.2969142 7.82942507 2.74237528]
In [11]:
# word & index number
tfidf_word_dict
Out[11]:
{'기록': 3.059014735227978,
'계약서': 4.0762238074865245,
'마시': 2.9296344494642166,
'그래픽카드': 7.4830303723052323,
'고민': 102.47237899899717,
'괜찮다': 4.5461563979857695,
'나을': 15.346104063990092,
'옮기': 17.092562774668181,
'안타깝': 6.0284246014505154,
'선택지': 4.8311254278910321,
'광주': 5.056074283813941,
'철회': 4.129889748313043,
'문의': 24.092461171219675,
'노트북': 17.100678766342558,
'화이트': 10.765715374793414,
'유플러스': 19.486255689020755,
'일하': 4.0667203292007956,
'기본': 46.679144353133104,
'구매자': 9.6881068522525364,
'하이마트': 26.667847797146177,
'홍보': 7.1818885912708899,
'ㅌㅋㄴ': 6.6258725718419837,
'64gb': 4.2459870638483741,
'ls2d': 6.6553979743928222,
'경기': 3.1890090523074286,
'위치': 11.445960340792681,
'고정이': 2.1158070432499416,
'먼지': 8.8844900155381872,
'메뉴': 6.1647463294687954,
'국내': 25.506655312012178,
'편한': 5.2286975301955447,
'평소': 10.478068323474037,
'싼맛': 1.7639435792524509,
'손가락': 6.2607914808829337,
'날씨': 12.575086311364982,
'입체감': 1.3689955551571156,
'화웨이': 6.5666922590602104,
'붙이': 14.427491654970744,
'효도': 2.3592703402753004,
'시계': 5.432995293830591,
'방진': 2.3526061393490409,
'분리형': 3.3640729137467362,
'만나': 7.3044262211203321,
'a7': 10.97119226959974,
'좋을거': 4.138639515872736,
'클리앙': 1.6604967304434213,
'인터파크': 4.0736909979458149,
'찍기': 2.7722655778636534,
'답답': 12.088134840725621,
'매달': 7.292020273741513,
'무겁': 5.5223640826357849,
'퀵커버': 2.3533250022721428,
'전국': 2.0452418369500038,
'의향': 1.3658346647091231,
'mode': 15.129171557751288,
'정리': 10.410856718465597,
'국민': 3.3550456986007418,
'일주일': 15.628509247374488,
'특별': 5.3109853183621389,
'묻히': 1.9440075174822016,
'웹서핑': 12.613364915892967,
'튼튼': 4.8099312120365258,
'점이': 6.2351153379022781,
'dslr': 5.0768768875500694,
'가죽': 4.3527763707368283,
'대부분': 15.856878258692657,
'높다': 2.2402374797526634,
'작업': 5.296182583293044,
'예약': 26.358424331707003,
'진심': 2.7084638903556089,
'사과': 5.738055352628292,
'칩셋': 1.3101767921621585,
'cat6': 4.0643718427905364,
'택배': 24.434754515441231,
'어둡': 16.086038977647465,
'노트5': 35.697724202905604,
'이번': 84.202277322137846,
'웨이즈': 8.8986381525346943,
'저번': 6.4505360460927825,
'특성': 1.9680512346178081,
'지난번': 2.3914644125504112,
'파손': 5.2312381450277261,
'모듈빼': 1.9855136500696458,
'채우': 6.1620886317620984,
'kt': 63.043686455350382,
'단말': 3.3530491478246973,
'사업부': 2.6493502423743562,
'안되는거': 4.6859282516575753,
'게다': 7.6929127061870188,
'구해': 6.4224290226414942,
'조립': 3.8997316730024276,
'장기적': 1.8785343530519556,
'거리': 9.6646162683667534,
'탐나': 5.8791890061248129,
'ㅂㅇ': 20.241717745595089,
'내야': 4.6512649659288963,
'달라': 14.269844698052903,
'가입': 19.52528898643051,
'판매': 22.446515501285798,
'해상도': 12.085334465001171,
'되지': 11.705017838827917,
'통하': 8.8342980102734465,
'가기': 7.661232171074456,
'이럴': 2.0559414311596047,
'판정': 4.4303538168013059,
'반응속도': 2.195279803076656,
'저가형': 3.4106132731493992,
'정상적': 4.845070271566577,
'유투브': 6.7247899589042106,
'적응': 12.247960201409175,
'절약': 1.6525809510120983,
'ㄷㄷ': 24.042846161248338,
'학생': 2.3715838729400605,
'인봉': 7.6781111005360065,
'광탈': 8.2732115791158414,
'활성화': 4.2297387379128164,
'은거': 2.6343312270064452,
'스마트워치': 2.0396082022498185,
'이부': 4.14363858723768,
'안주': 2.9500409018972347,
'스냅': 11.727092387170662,
'길이': 6.3785474650822733,
'조사': 2.4707506565298409,
'마구': 1.9315958276384104,
'인해': 7.7374005271496973,
'techholic': 0.45500840529409387,
'다시': 62.368378974545621,
'싸구려': 2.1132147962552197,
'ㅅㄷㄹ가': 3.6182793518877432,
'감기': 2.1363790044877597,
'대기': 9.7336177943400788,
'하도': 6.7622284520633604,
'하필': 2.1161258146394619,
'가격대': 11.969139229260124,
'단통': 2.056116918173204,
'빠르': 24.610393207834363,
'없애': 9.25012771734289,
'신경': 16.727798336418417,
'귀찮': 19.445123786029132,
'절대': 10.912932702662554,
'자극': 1.4951740727778311,
'이르': 9.0376098530492275,
'편차': 1.5179976015106686,
'사실상': 7.6569268459121149,
'작동': 10.53866685383548,
'바뀌': 16.057759889776182,
'교품증': 26.925552127779131,
'한마디': 2.1996089533510967,
'후속': 4.9901135545594091,
'취소': 10.815306250411934,
'아그': 1.7444759026824372,
'제이슨': 3.6959694423904841,
'다운': 10.826041057338221,
'생김': 2.0540486306694596,
'수채화': 5.33281812109509,
'중간중간': 2.3564377117181001,
'인상': 1.996154407532795,
'받으': 1.848583484342212,
'갤수육': 3.6130342710019492,
'내년': 4.6997852944708383,
'운영': 2.50863043767542,
'audio': 1.6382541093688254,
'정착': 1.7589886971405753,
'모듈': 217.49001232419968,
'처리': 7.3844043450368018,
'개취': 5.4656653048316022,
'확대': 4.8728240605925812,
'점유율': 4.1920492085250247,
'어머니': 17.307428494960032,
'현존': 2.3361778907093957,
'물리': 8.4185299957335005,
'서류': 3.7982830896194431,
'방출': 2.5095244212561485,
'못생기': 3.106950299138179,
'세로': 3.625127008137246,
'현명': 2.4844822793658974,
'겨울': 1.8077310615527276,
'각종': 4.1194577356536533,
'땡기': 9.5034462340998136,
'공간': 3.4171809544529723,
'총평': 1.3082871647636123,
'인간': 1.2948791398607591,
'해주시': 5.5675150605181285,
'생길': 5.0117510854312268,
'사무실': 6.0509832041422245,
'규모': 1.5697234870288841,
'오더': 1.6356714575438365,
'설명': 12.324677320333798,
'드릴': 4.6105538290674266,
'sid2': 6.9609510213198131,
'알아보다': 4.3092581109178507,
'마감': 17.576829315081536,
'발급': 5.212865068469835,
'보다': 8.5249505565370356,
'사은품': 30.495346821366397,
'장소': 2.7729265830823038,
'다음날': 3.9539775121939917,
'빠르다': 1.7565283280040924,
'심지': 6.0507970290133484,
'실수': 7.50941959608181,
'그럼': 2.7814992884361782,
'발표': 14.819895712705144,
'번이': 53.614699049721338,
'옵션': 5.9805756925181868,
'울산': 3.5604469259914491,
'남지': 1.8619178825380287,
'하단부': 8.2619275008870829,
'광각': 72.577668205874346,
'하나하나': 1.9707121379664148,
'아저씨': 2.3522318134518421,
'하이': 2.0154246600262904,
'각도': 2.8059071761997378,
'보스': 1.9707738268894814,
'매년': 1.752124328372167,
'갤6': 18.660673396416637,
'스타': 3.2591429388288558,
'어떠': 11.909416883044065,
'안정감': 1.7636348279340597,
'망작': 1.7380766646197952,
'빠릿': 3.7410510517091731,
'펌웨어': 3.9187736091902989,
'일반인': 5.5565703028519877,
'꺼지': 10.413155682730824,
'불가능': 8.5205571697127702,
'쓰지': 5.560748439255808,
'아침': 11.860072366958502,
'주지': 3.4125934096580823,
'탑재': 10.558083417923113,
'물품': 2.1487920816398716,
'오늘': 118.85004904096492,
'전화로': 1.7971742259529688,
'부2': 8.9135793583892529,
'하지않': 1.7633707094409186,
'먼저': 12.364619790387753,
'프렌즈': 14.885212210886445,
'내면': 5.4380537661287862,
'근래': 2.7437522824671006,
'주면': 3.099168542554172,
'넘사벽': 3.3915724337714037,
'ㄹㄱㅂㅇ': 25.485513942197315,
'인정': 5.0524167875603272,
'과장': 1.8739194839139006,
'마마무': 4.5925304757061474,
'산지': 6.1697936409971508,
'동생': 9.8421488557346422,
'감감무소식': 2.1117391447782818,
'양품': 34.073231170280245,
'변경': 20.551512963409532,
'하시': 25.12844504329135,
'하이브리드': 2.3450168849674782,
'a4용지': 2.9599309877623101,
'국민카드': 4.5872520283019291,
'edge': 5.895874296503389,
'케이스': 80.554470532013411,
'백그라운드': 2.6464112078183937,
'이것이': 1.7767137910318078,
'사놓': 2.6939722424157808,
'등에': 2.1094351622311578,
'현실': 7.7603935384872536,
'프로세서': 2.1681882341584138,
'g540': 3.3990624157568785,
'각각': 4.202208379112836,
'qc': 11.311716411781475,
'중요시': 2.1448651639581744,
'galaxy': 7.8294250688893268,
'그정': 4.9694272795802608,
'6in': 2.7872005380767932,
'htc': 5.6484110714511937,
'고정': 7.3624367271707136,
'유리': 8.3530998389982241,
'채택': 4.4885372425835266,
'목적': 3.9158570533990473,
'롤링': 4.6219246880283942,
'삼성': 87.32090518082903,
'ㅅㅋ': 15.444568758899699,
'스크린': 5.6999344117973747,
'욕심': 4.3939917308501348,
'그림': 2.7712591185266544,
'덕분': 7.2702313050249963,
'중요': 13.174137219412573,
'초중반': 2.5505384376299043,
'밤에': 5.4611365790196107,
'지속적': 1.3688469021360727,
'아내': 2.3008189162606332,
'9ghz': 2.2476808154259982,
'신경안쓰': 2.7768593106373696,
'메모리': 11.864199452648382,
'모델명': 2.6508505445778674,
'노이즈': 8.243884175621961,
'보다보': 6.0515852651238049,
'인수': 2.4527583818264671,
'고르': 10.596824726485844,
'나오길': 3.8786914974221154,
'남자': 8.9464811500125379,
'물에': 2.7295098264224031,
'아식스': 3.4038388563469764,
'글쓰': 4.271713249084601,
'고집': 3.1867634711598836,
'반해': 1.8570584141507782,
'이젠': 8.770766557681382,
'방식': 14.473926214967964,
'사정': 1.670850710749568,
'은근': 7.1576480504665376,
'단통법': 19.196687454956887,
'보여주': 8.0223167650298759,
'외관': 14.24540743236542,
'오고': 4.4060994739689985,
'저장': 6.6155287420546962,
'위하': 15.628175184183826,
'연락': 16.437649714644714,
'지우': 3.6611491368007028,
'때까지': 1.3910457246632868,
'아가': 2.6369616350641296,
'대화면': 1.9590814778927006,
'신경쓰': 11.713518386072931,
'커지': 2.9758692081354381,
'서비스': 15.278969899220487,
'용어': 3.9121775304621198,
'얼른': 6.5995656070675963,
'감사': 18.668453726497539,
'차후': 3.2839846755920523,
'기업': 6.4515312859338092,
'2년정': 2.07985234659836,
'오해': 2.5589205563415889,
'상상': 4.5098141877009388,
'부무': 11.213921205648717,
'완납': 9.9755919717255779,
'서랍': 1.9599587452586675,
'업무': 3.6750114032942642,
'해외': 15.066721022123183,
'올라가': 4.303729735738302,
'현완': 14.915754438622583,
'귀가': 3.0144156117923306,
'치명적': 5.3613657114527324,
'이거': 59.871076015117779,
'128gb': 1.5526004118952574,
'하반기': 3.2933166786964234,
'상태': 27.482183618173764,
'안정성': 2.0716494268013346,
'커뮤니티': 3.0982489925985339,
'엄청': 45.087365676410563,
'뒤지': 2.6295998355465029,
'현실적': 1.403352738026999,
'정품': 9.125151319103292,
'갤s7': 17.1263971225879,
'url': 2.2908471949412963,
'선보': 2.7014289678498464,
'sd': 13.22121583862824,
'보류': 2.0134242113042511,
'일일이': 1.6788668679477432,
'114': 2.2489271596350617,
'인터페이스': 2.1877757696010187,
'마케팅': 17.712050053656728,
'이어폰': 58.730097010610727,
'분위기': 9.7986351266857472,
'정확': 12.979621667828395,
'맨날': 3.8774642613101329,
'저항': 2.6959903670863148,
'유지': 26.455371730305139,
'원활': 2.4067119294039365,
'팍팍': 2.7604613095114781,
'다가오': 3.4528821025130014,
'서비스센터': 24.548211972935345,
'be': 5.388858368427532,
'떨어지': 18.210391784968483,
'착한': 2.5023498460849014,
'하루': 19.318029614472596,
'해결방법': 2.5139469931465368,
'트렌드': 2.0768476357331065,
'기왕': 1.871493742947866,
'극복': 1.8116715801756638,
'네트워크': 1.6482458682085441,
'vga': 2.3001823733602289,
'리모콘': 2.2015792664162785,
'현대': 2.930225646176885,
'단점': 22.866477873724982,
'폰값': 2.3309296998486824,
'오류': 9.3165497856506576,
'난리': 5.7838899490198799,
'등록': 7.2258667036878279,
'계기': 1.8091654977152987,
'개인적': 34.832867527669215,
'naver': 25.232449107005369,
'for': 3.623037907262761,
'5se': 2.9961487719469515,
'laptop': 4.2299158988759213,
'중앙': 2.0394096412798581,
'자체': 22.683960882631549,
'연결': 37.188992478583366,
'여론': 1.7813613487268656,
'스토어': 1.4633626602071579,
'중국': 12.218633253798316,
'앰프': 2.0188167603068092,
'유선': 4.6375889950475591,
'눈팅': 12.497774781165853,
'electronics': 5.9088553904621888,
'완전체': 1.6781593831226143,
'어짜피': 5.8054650881899459,
'구경': 10.407416191772464,
'방탄': 2.2385613460144604,
'베샵': 6.2372694709346028,
'인거': 4.8143184340887641,
'귀엽': 4.1987043976935148,
'카드결제': 4.0271065327153623,
'하나': 39.334851043532112,
'적지': 1.8752199948368542,
'기사': 8.5237784826353664,
'포함': 14.08536911861944,
'건너': 2.1201580748354263,
'정보공유': 2.5365324309919992,
'구글': 10.362583339004102,
'어려': 3.4565687210701102,
'해당': 9.5433208476167852,
'asrock': 1.9279887694599425,
'잠금': 5.3635489094234057,
'나타내': 4.3966521890352093,
'데이트': 19.650724787419701,
'열흘': 3.0146379380613637,
'지프': 7.7693558024360421,
'대세': 5.4955872257658038,
'나르': 28.123380703084585,
'허허': 4.9848275527299961,
'노트3': 12.918958635532704,
'빠릿하': 2.3669622177006082,
'겔럭시': 2.5740719507629239,
'플레이': 7.1249312500380331,
'sw': 1.4263747455165994,
'발생': 16.351719233426639,
'지네': 2.1387429392608457,
'미디어': 3.6184529297066366,
'지점': 2.812094678380086,
'마트': 2.8340847762792163,
'꺼리': 1.5633813206638139,
'상단': 14.160289100818517,
'돌아가': 7.4089783487610097,
'좋다': 15.302225192610656,
'게임': 43.665520561860255,
'생기': 26.281988617901717,
'분리': 17.178694236962755,
'카드': 26.344464773141439,
'빠지': 13.587959383043684,
'쌓이': 1.8310721133528163,
'예상': 22.148152877352249,
'성향': 2.1356363299605516,
'땡겨': 1.3382667635281975,
'개철': 10.642740857469159,
'제시': 1.4558956549444431,
'샀다': 3.9993765953029161,
'능력': 1.3994032528835449,
'aspx': 3.1134217500744952,
'한데': 19.553885720457274,
'희망': 3.0797441237696503,
'그때': 7.1325732368754418,
'강력': 2.3017433986866793,
'듣기': 2.6635890485202234,
'반대쪽': 2.712918902567782,
'강제': 3.9428462342604051,
'아마': 4.5374480000541872,
'허접': 2.629831499695634,
'lcd': 7.5663073672847112,
'있을까': 29.037860947922713,
'일정': 6.2787122400797379,
'유일': 3.0675856858779302,
'간지': 1.9234558679925935,
'gpro2': 3.8274038809864379,
'롤리팝': 4.1330677239791118,
'주변기기': 5.6738856457221001,
'볼땐': 2.5727646352419349,
'수요': 2.5653428777606826,
'로지텍': 3.4819740974786253,
'구합': 2.3786801434963802,
'가족': 12.069595698744816,
'와중': 3.3403735649386852,
'어떨까': 4.7403787412924183,
'원가': 3.094316556850043,
'흥미': 3.5128640416788204,
'디자인과': 2.9234519565510326,
'순위': 3.9686780063954514,
'결론적': 2.9020086771837508,
'이상은': 3.6044494547806152,
'주변': 10.734787437469324,
'of': 2.4801377692488673,
'검색': 25.839799149721664,
'599요금제': 7.7909246208232394,
'한시': 3.5099318896157499,
'간단': 14.113725299492579,
'쥐5': 5.2753195776744404,
'확정': 4.3733972514905171,
'어이': 1.6402439688946335,
'더하': 5.6482108797892723,
'직영점': 8.0208852009829119,
'부정적': 2.1539148583172074,
'공짜': 11.341236303446486,
'걸치': 0.71404938435427023,
'고치': 4.2463400956706625,
'기회': 6.2299791542631882,
'측정': 9.0655189254327215,
'버벅': 8.4291125722828397,
'장착': 21.028297489550614,
'samsung': 3.0204791180269996,
'스냅드래곤': 4.7684565525076588,
'아니': 110.0693138672469,
'버젼': 2.3819773143310821,
'성공': 14.342890189520325,
'설레': 3.4406238611417739,
'한쪽': 8.9837852980143786,
'남기': 9.9462581597617206,
'잡음': 3.5974391875849756,
'한번': 18.625456844276162,
'이제': 55.547072694354739,
'정책': 26.91757231937639,
'lte': 17.121175449704868,
'유독': 3.4987717913403817,
'저하': 2.2473804275801434,
'버튼': 24.943226680760876,
'단자': 9.0521420350245041,
'자국': 2.9378490467358183,
'현상': 17.745610661649124,
'lte2': 2.5937071055175895,
'재미': 9.1792506580455431,
'패턴': 4.4733592841561718,
'팬택': 4.6138443350003806,
'걸리': 19.57344125351268,
'어서': 4.3129889995059694,
'젠더': 15.115701106973678,
'내일': 43.265778730726034,
'g4': 82.505880415487866,
'내부': 8.1899405683975104,
'방금': 16.156972100468831,
'있는곳': 2.174209600340983,
'차액': 2.6884473022662436,
'좋긴한데': 2.2187290301698379,
'우리나라': 3.502273417518186,
'이전': 19.832563099352583,
'케이블': 14.80954135405211,
'만료': 2.7896919514701248,
'루머': 8.6366858907752508,
'이정': 49.719913602511802,
'그나': 26.647893687295216,
'저작권자': 0.5440204469005413,
'안봐': 2.67954067302166,
'42mm': 2.1982643173520633,
'보조': 4.801077055450433,
'후기': 37.49660533176732,
'롯데': 2.7415529242796777,
'ppl': 6.6946347905039563,
'직장인': 1.5492643589111252,
'의도': 1.6508867267571528,
'보자': 4.0862130069627201,
'시키': 6.1583339583463399,
'사용성': 1.8597163913235426,
'비디오': 4.9534177744256809,
'아쉽': 39.361604817265494,
'구간': 0.88941008423171797,
'적절': 2.230528817286928,
'도료': 1.7951564676753096,
'사용하다': 7.2083003719441772,
'못하': 45.589848827674693,
'여러모': 4.869349865721972,
'줄이': 4.4528238216492113,
'한번더': 2.2652186497703317,
'상승': 5.4477988838357208,
'이후': 26.295464410782092,
'글쎄': 2.2779346602733721,
'무이자': 4.5050267679108709,
'마이크로': 3.6684604037595205,
'싼거': 3.8928494033067098,
'크롬': 20.705942806383412,
'행보': 1.4626032271301583,
'깔리': 1.8962920758618871,
'한손': 8.3609549743799096,
'버벅거': 3.9731931010595232,
'후로': 2.3944222807663835,
'금방': 7.099205017642654,
'결과물': 4.326543802208846,
'encode': 0.85853129842784903,
'버스': 6.9962896143024604,
'진입': 5.0076395558632507,
'f700s': 3.7054906477065739,
'칭찬': 5.0448301963369042,
'신기': 17.771305836021977,
'타입': 4.9672767385194296,
'사운드': 10.116966910812145,
'첨부': 7.4439399351498485,
'떠오르': 3.5410377044452814,
'부럽': 4.7741024718732152,
'난감': 3.0971477820201261,
'수치': 2.7562493612504455,
'대응': 3.3612742884955278,
'처분': 3.5110105297886509,
'os': 6.0159620176861326,
'발전': 7.3855983815260347,
'hdmi': 2.7201245442546353,
'반영': 2.6413705065326338,
'유사': 1.4343415139562712,
'못봐': 2.4371129253209811,
'고장나': 3.4827905527800915,
'카드사': 3.3937593205324226,
'카페': 6.476097018915314,
'하라': 3.8875779930031977,
'5x': 3.9510671188252835,
'오랜만': 17.376585231198291,
'이런거': 9.3126657494644345,
'갈수': 2.8738877064304917,
'레노버': 10.348517748436345,
'밧데리': 10.474885906453464,
'특유': 2.81103873746074,
'난다': 2.2794058178883176,
'운전': 2.3142184894890661,
'일반': 33.495859238356886,
'갤칠': 8.2759788330628812,
'g7x': 2.296914201843463,
'말로': 3.7660585051281279,
'사라지': 5.3976257095624698,
'태블릿': 2.8804135052162931,
'여행': 7.8158070493704441,
'뜨겁': 6.8213535408783246,
'편리': 5.4362216655470306,
'죽이': 2.5807489189035056,
'아이폰7': 6.4459073900500687,
'있다': 32.3831816326759,
'이틀': 7.6213795286156589,
'대상': 4.8416271612497424,
'누나': 3.5810969745351313,
'별차이': 4.4078729960846035,
'compulsory': 5.806457619627893,
'더이': 5.1192238581422655,
'버리': 14.854943138377159,
'비슷': 37.455093506113101,
'끄적': 1.5565437625253697,
'위주': 6.9335207876983374,
'알람': 2.9711872402669517,
'소음': 1.2240980694965871,
'내장': 21.253575526805943,
'여친': 4.7777543380379646,
'넣어주': 2.8641689611814503,
'선택': 70.01610425620072,
'사고': 23.972067960133025,
'참여': 5.8906312065792203,
'지포': 6.3637883052777484,
'소개': 8.3818950860880754,
'상담': 6.0487811950345236,
'필요': 33.33285565725118,
'다음주': 10.02461965287169,
'엄마': 4.9453011872291848,
'아이폰se': 9.3339407798118064,
'침수': 5.4908096120166121,
'불구': 4.8360574970980501,
'떨구': 5.7977409368210928,
'음향': 4.0751623144396385,
'사용기간': 1.9198925034462617,
'ㅋㅌㅂㅇ': 34.816449341024551,
'애초': 6.0117847140293117,
'5v': 2.2298017023148438,
'편하': 22.116196233308937,
'오른쪽': 15.20721192744629,
'화소': 6.6832660603338852,
'이득': 7.4927469641355771,
'짜증나': 6.6755798903121928,
'newsid': 3.0085053067107861,
'보급형': 9.4368935836123828,
'라오': 1.6780567077373709,
'착각': 1.8212967014478723,
'최적화': 12.730444770068656,
'공기계': 18.802755213654699,
'대구': 7.2096995800211481,
'브랜드': 10.23056248757821,
'내용': 14.111635929267539,
'디지털': 2.1675946046223302,
'들르': 2.8420770680940399,
'캡쳐': 6.5390169760065673,
'홈피': 2.4062679149634176,
'같습': 12.050704536797271,
'티탄색상': 3.5201597300669261,
'자세': 15.603623341590296,
'요거': 1.9235727930708011,
'르그번': 4.6641755353002372,
'뷰2': 3.2665756507872072,
'출력': 7.1370515841051922,
'53mm': 2.946401344242497,
'기쁘': 1.8029735406326235,
'이면': 4.9868942437794237,
'예의': 4.6201783740401927,
'다녀오': 2.6060016749928243,
'고질적': 1.9108933700742607,
'네비': 2.2944255856399498,
'마시멜로': 10.460742526805646,
'메인보드': 14.976979410634298,
'꽂히': 4.4572744698943234,
'차량용': 2.5261346689284125,
'촬영': 19.824380535103082,
'뽑기': 14.083872903591987,
'물고': 1.9145155350498375,
'패드': 4.0311819692219952,
'감싸': 1.819519837565601,
'실행': 9.2697153092449511,
'그부분': 2.5658228940394832,
'센스': 2.3106388358429286,
'체험존': 8.9640270877510133,
'잘몰': 2.4761405135753054,
'욕먹': 4.0389758840941958,
'기억': 10.575995431276887,
'호환': 19.01106938776034,
'놔두': 1.294352085756052,
'전원': 31.216452475774474,
'삼성꺼': 4.1083416277262366,
'설정': 25.538463255583434,
'벌어지': 3.1932340564201271,
'아이템': 2.9801585603576344,
'스크래치': 3.7996872996743853,
'h61m': 2.7320591049642795,
'알루미늄': 3.9914364733420182,
'올립': 11.651200132189425,
'연락처': 1.9853608261399334,
'글씨': 1.7726709293603915,
'빠릿빠릿': 3.2158829974424719,
'마감도': 1.6807410925455628,
'비해': 16.928773457789539,
'인하': 5.6387025054765934,
'사양': 10.382208802346682,
'장사': 2.9618974133528084,
'검정': 1.5336499051789547,
'벚꽃': 4.7703428469136169,
'구리': 4.3396540515434063,
'탈부착': 4.5779899519165568,
'어제': 55.905339397234094,
'쓰긴': 2.237396945409277,
'신분증': 4.3222857911357924,
'파일': 10.847968052428863,
'베이스': 1.8455876084002574,
'마치': 7.2903707130194286,
'사신': 8.1773559430042884,
'aod': 3.8527647629207999,
'g5광각': 3.9589046541100235,
'신작': 2.112721730400525,
'후면': 31.463162485722574,
'대해': 11.22905772936274,
'영상': 25.308691706330624,
'개발': 5.8498119212583246,
'노트4s': 5.7826149969823897,
'교체식': 2.7124467606924796,
'생각': 143.97184562519266,
'반응': 14.896237706895947,
'개통': 80.796171303944419,
'초성': 4.1045683628646934,
'비하': 4.2619040695073407,
'고장': 14.950841657415626,
'누르': 25.137317545666285,
'지4': 2.7573363304261655,
'임대': 3.3205475926634662,
'접어': 1.8723946749338272,
'대만족': 5.9466390910017584,
'미리': 9.553042175187759,
'단차': 57.030391042504007,
'코어': 1.2038262444981747,
'보단': 3.1723625349134617,
'전산': 2.6153198198020644,
'단말기': 11.966904188751599,
'g5사': 14.610571069775299,
'밸런스': 2.4465776608704397,
'추가지원': 3.318374536366091,
'파악': 2.4531692023185077,
'지역': 6.0928660453851569,
'안나': 19.920294708395609,
'베스트샵': 15.160853771825904,
'확장성': 3.0553768241193411,
'놀라': 4.7958243583818296,
'차량': 3.6889608103317797,
'기타': 7.4966063996661907,
'pro2': 3.2448118112157922,
'주로': 10.289033952684781,
'와이파이': 20.663545797888272,
'이동': 47.271134522744987,
'신호': 1.4324663890769764,
'개인': 8.5473437802323211,
'더럽': 2.8083358769980413,
'벗겨지': 1.8127491736337689,
'강하': 5.5531295304483876,
'주실': 2.9023622112044705,
'the': 10.571641780604871,
'전화기': 3.2835194321502441,
'4s': 1.7934637411015855,
'아이디': 3.5321928816580694,
'wb': 6.6851255005376764,
'오늘자': 4.2015968746030463,
'mwc': 4.8102502117181514,
'다이얼': 1.8421217580579363,
'youtube': 18.164886150384476,
'의심': 5.599721323087242,
'방통': 3.9035351884443146,
'추억': 1.8382177244434708,
'중복': 4.5113885282017767,
'최저': 4.9188765389032545,
'제발': 14.108314368883127,
'도착': 16.986228848605712,
'단지': 4.6185297051481271,
'플립': 2.2276453540913987,
'였습니': 2.278271215819387,
'달고': 6.056523216438773,
'세트': 2.4491891033152697,
'착탈식': 7.0154899185396777,
'휴대용': 2.2027868803308372,
'두가': 7.1499111726224793,
'나타': 2.1943120582863238,
'벤치마크': 2.765281778057179,
'번갈': 2.7421141358628147,
'아노다이징': 2.8616272547222747,
'이기': 4.4580772659544188,
'편안': 2.6313606482728775,
'디바이스': 2.6563455674638736,
'끼울': 2.290347489714839,
'계열': 2.023929569681246,
'완벽': 9.4136465386519568,
'획기적': 2.6408663063632365,
's7엣지': 18.256155156762151,
'후에': 9.5957486483753218,
'왜곡': 15.487284976996845,
'중고': 33.829387902539118,
'묶이': 2.9117845644378888,
'본문': 1.6622988764010627,
'감탄': 2.1497125965982171,
'띄우': 4.2598884510693189,
'시각': 3.3886360281092944,
'책상': 1.8016791412060349,
'만듦새': 1.4475593137419567,
'좌표': 40.26925466988407,
'떨어트': 5.3266457192336336,
'신선': 5.0990407430590814,
'한창': 1.5051411550329443,
'밀어': 1.8479146615731397,
'모아': 2.8738449287176611,
'신품': 3.1626246021784015,
'하진': 1.8121881169786951,
'64g': 7.7399772882727325,
'지문': 27.588574725175459,
'날짜': 5.099569490806025,
'미루': 3.0936680688488272,
'측면': 7.7173131398906545,
'함정': 6.9084287479389523,
'형광등': 1.9451778451965023,
'단독': 1.9754244267499601,
'해석': 3.7159758031248855,
'차별화': 2.2345395031851591,
'입히': 3.0321614327082309,
'청구': 14.378880849158325,
'그날': 2.5697560235591448,
'타임': 6.2390255967601354,
'쓸모': 3.2225798912676611,
'보도': 2.4735608997559,
'기술적': 2.8520936724804002,
'가루': 2.2703372301918687,
'가정': 4.2413232761107071,
'힘들': 32.595497097644262,
'필요도': 1.1560113450375979,
'군요': 1.8797602658645938,
'인상적': 2.3794338295571054,
'뉴스': 5.7030128839438197,
'정도': 77.797971170013028,
'집사람': 3.003722188920229,
'베터리': 19.760519319223597,
'당연': 18.489301168464412,
'표현': 6.3628649036396441,
'절연': 2.8285091725827898,
'on': 7.2784452136919251,
'넥서스': 7.2584771787623152,
'각설': 1.8409439266488459,
'통해': 4.8510071004465685,
'마무리': 2.7832330001940813,
'못쓰': 10.034049718092961,
'자신': 2.3393589830070214,
'흔적': 2.6875599813501849,
'물론': 4.4141390239170448,
'계속': 8.9603071759190076,
'바래': 4.3036849010769265,
'일도': 1.5834183454481803,
'g5쓰': 2.0342803642713414,
'항목': 1.8582113779790361,
'그게': 12.419967146999777,
'편입': 1.5316852096337512,
'음량': 3.8263343334866899,
'80l000alus': 3.212475482880687,
'돌고': 2.2401097105969301,
'여태': 6.9329850467676728,
'터치': 20.804384560432506,
'후반': 4.5600279922471252,
'종종': 2.978359123983926,
'g580': 6.7855985572336008,
'단어': 3.7247529190460509,
'훌륭': 5.708541570294221,
'갤럭키': 27.467052536817587,
'당시': 6.8564076523901623,
'신용카드': 2.7675040092607959,
'여름': 4.4096716993363581,
'인생': 1.4306957404948222,
'주변부': 1.3828590241888417,
'16gb': 1.519817887750057,
'good': 2.1605577362378874,
'대략': 13.702601283776149,
'한명': 3.0037045215786056,
'다양': 10.28237745745116,
'구라베젤': 8.3418454479577804,
'효율': 5.7694142341193251,
'5s': 6.0425999936922521,
'저조': 11.695479344280155,
'ㄷㄷㄷㄷㄷ': 4.9813700902829767,
'비닐': 3.4210503999103454,
'59요금제': 15.385753633967701,
'멈추': 7.1066711268809808,
'검수': 1.9368491157122432,
'들림': 3.2957730646135679,
'늘리': 2.9206866036031642,
'유격때문': 2.4531361781691832,
'메세지': 2.6819008951163035,
'몰레': 20.77399908223574,
'판매자': 9.8103339783950751,
'기본기': 2.8860524491200037,
'뒷부분': 2.0596118225345492,
'두시': 1.9934696534559309,
'기울': 3.9919589458652314,
'저번주': 6.5130994151058186,
'심해': 8.7736637698652835,
'내방': 12.914567682347272,
'밴드': 36.299120120101406,
'타격': 2.8486071418621477,
'제일': 24.992559751249217,
'이벤트': 45.906233267885334,
'히트': 3.6212376140879794,
'아마존': 4.486738032889737,
'어떤': 25.536814467667249,
'여행가': 3.2383038991735664,
'쾌적': 3.8776126096343391,
'기계': 19.683344980851292,
'지금까지': 14.358557633833817,
'고속': 13.25108467082922,
'자꾸': 19.443937574746091,
'언제쯤': 7.1025220967935399,
'그동안': 8.9940574422983026,
'전후': 2.8355927722100729,
'상품': 8.3106422558199995,
'여유': 3.6307436957597603,
'착탈': 3.6957758605471418,
'판단': 6.8291655635593189,
'일어나': 1.9564593816732607,
'ㅅㅋㄱㅂ': 33.573983928503658,
'시세가': 3.051510739674637,
'반대편': 2.1584673339479905,
'올려놓': 3.0458909949833739,
'사람인': 1.6413800583464662,
'하니': 22.495116143888414,
'무시': 3.5595970040448033,
'월드': 3.2700096254844575,
'대충': 18.842066804856152,
'재생': 12.173103086811359,
'시리즈': 14.641694194845831,
'미지원': 2.2971138148134029,
'업자': 5.1872079555723012,
'못가': 2.8961241016957291,
'by': 4.1634541916519652,
'대용량': 4.5924982781169552,
'안계': 2.4799662593268663,
'전에': 8.2023907493022516,
'망해': 2.3837991981766664,
'둘째': 2.2412812818206964,
'화요일': 2.8815616520873277,
'말도': 4.5417688252386155,
'계신': 27.969248857080906,
'버전': 12.395361270015846,
'깜빡': 3.4800518195869734,
's6엣지': 2.3170427758418626,
'정지': 2.250332667152839,
'요금': 83.657255949289976,
'사용환경': 1.2433281853993017,
'공감': 1.552119059162177,
'아쉬운점': 2.0105284723666901,
'바깥': 2.1400772876892482,
'이동하': 4.0964574233222173,
'무언': 3.0530508191565211,
'안녕': 32.093151849514044,
'분할': 1.8086287569656487,
'좋긴': 3.2418801433324469,
'신박': 4.776194521370126,
'이만': 1.6125449590134167,
'알지': 3.0265907406924271,
'짜증': 6.447933132230931,
'한해': 1.8471092219896084,
'동그라미': 3.4734725611484625,
'lg꺼': 2.0489889615166295,
'일요일': 4.0393714482936076,
'분이': 9.7530012058382791,
'용도': 6.9124548628479081,
'순정': 4.8313450906673312,
'사용': 143.05304496597481,
'분야': 1.5541991823157857,
'자전거': 3.5162796626974653,
'나오는거': 4.5965986068068529,
'유격': 110.69394623277742,
'오프라인': 6.6402598515072242,
'중심': 2.4305379265544653,
'i3': 7.3999415231453822,
'구매가': 4.5305828824458212,
...}
In [12]:
with open('tf.csv', 'w') as f: # Just use 'w' mode in 3.x
w = csv.writer(f, delimiter=',', lineterminator='\n')
for key, value in tf_word_dict.items():
w.writerow([key, value])
with open('tfidf.csv', 'w') as f:
w = csv.writer(f, delimiter=',', lineterminator='\n')
for key, value in tfidf_word_dict.items():
w.writerow([key, value])
In [13]:
tf_cooccur = (tf_bow.T * tf_bow) # co-occurrence matrix in sparse csr format
tf_cooccur.setdiag(0) # fill same word cooccurence to 0
print(tf_cooccur.todense()) # print out matrix in dense format
tf_cooccur
[[0 3 0 ..., 0 0 0]
[3 0 0 ..., 2 1 0]
[0 0 0 ..., 0 0 1]
...,
[0 2 0 ..., 0 0 1]
[0 1 0 ..., 0 0 0]
[0 0 1 ..., 1 0 0]]
Out[13]:
<3578x3578 sparse matrix of type '<class 'numpy.int64'>'
with 3661940 stored elements in Compressed Sparse Column format>
In [14]:
# create co-occurence matrix to pandas dataframe
tf_df = pd.DataFrame(data=tf_cooccur.todense(),
index=tf_corpus.vocabulary_.keys(),
columns=tf_corpus.vocabulary_.keys())
tf_df[0:10]
Out[14]:
ls2d
사과
기회
게다
장기적
사지
판매
파지
광탈
인해
...
들뜨
올라오
나머지
풍경
feat
예민
모듈식
분리
안받
하이엔드
ls2d
0
3
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
사과
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
1
2
1
0
기회
0
0
0
0
0
0
0
0
0
0
...
0
0
0
1
0
0
0
0
0
1
게다
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
장기적
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
사지
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
2
0
0
판매
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
파지
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
광탈
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
인해
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
10 rows × 3578 columns
In [15]:
# co-occurence matrix to csv
tf_df.to_csv('tf_cooccur.csv', index=True, sep=';')
In [16]:
# use Korean font, set mask image
font_path = 'C:/Windows/Fonts/NanumBarunGothicBold.otf'
mask_image = np.array(Image.open('D:/Document/project/HYStudy/scripts/[HYStudy 17th] mask_image.jpg'))
# Generate a word cloud image and display
wordcloud = WordCloud(max_font_size=72,
font_path=font_path,
background_color='white',
mask=mask_image).generate_from_frequencies(tf_word_dict)
plt.figure(figsize=(30, 90))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
Content source: shinys825/HYStudy
Similar notebooks: